Title: “DEG Analysis Code Annotation”
Author: Annika Jorgensen
Date: 02/02/2022
Purpose: This document is for the author to demonstrate understanding of the “DEG_analysis_changed_comparison” code and theory.
The first chunk of code is dedicated to installing the libraries. These libraries are to help execute the differential analysis and helps visualize the data.
#==================================
# Markdown Libraries
#==================================
#==================================
# DEG Analysis Libraries
#==================================
library(RColorBrewer)
## Warning: package 'RColorBrewer' was built under R version 4.1.3
library(matrixStats)
## Warning: package 'matrixStats' was built under R version 4.1.3
library(ggplot2)
library(edgeR)
## Loading required package: limma
library(DESeq2)
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
## The following object is masked from 'package:limma':
##
## plotMA
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which.max, which.min
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
##
## Attaching package: 'IRanges'
## The following object is masked from 'package:grDevices':
##
## windows
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
##
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
##
## colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
## colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
## colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
## colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
## colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
## colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
## colWeightedMeans, colWeightedMedians, colWeightedSds,
## colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
## rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
## rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
## rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
## rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
## rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
## rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
## rowWeightedSds, rowWeightedVars
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
##
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
##
## rowMedians
## The following objects are masked from 'package:matrixStats':
##
## anyMissing, rowMedians
library(limma)
library(doParallel)
## Warning: package 'doParallel' was built under R version 4.1.3
## Loading required package: foreach
## Loading required package: iterators
## Warning: package 'iterators' was built under R version 4.1.3
## Loading required package: parallel
library(variancePartition)
## Loading required package: BiocParallel
##
## Attaching package: 'variancePartition'
## The following object is masked from 'package:limma':
##
## classifyTestsF
library(org.Hs.eg.db)
## Loading required package: AnnotationDbi
##
library(clusterProfiler)
##
## Registered S3 method overwritten by 'ggtree':
## method from
## identify.gg ggfun
## clusterProfiler v4.2.2 For help: https://yulab-smu.top/biomedical-knowledge-mining-book/
##
## If you use clusterProfiler in published research, please cite:
## T Wu, E Hu, S Xu, M Chen, P Guo, Z Dai, T Feng, L Zhou, W Tang, L Zhan, X Fu, S Liu, X Bo, and G Yu. clusterProfiler 4.0: A universal enrichment tool for interpreting omics data. The Innovation. 2021, 2(3):100141
##
## Attaching package: 'clusterProfiler'
## The following object is masked from 'package:AnnotationDbi':
##
## select
## The following object is masked from 'package:IRanges':
##
## slice
## The following object is masked from 'package:S4Vectors':
##
## rename
## The following object is masked from 'package:stats':
##
## filter
library(GOSemSim)
## GOSemSim v2.20.0 For help: https://yulab-smu.top/biomedical-knowledge-mining-book/
##
## If you use GOSemSim in published research, please cite:
## [36m-[39m Guangchuang Yu. Gene Ontology Semantic Similarity Analysis Using GOSemSim. In: Kidder B. (eds) Stem Cell Transcriptional Networks. Methods in Molecular Biology, 2020, 2117:207-215. Humana, New York, NY. doi:10.1007/978-1-0716-0301-7_11
## [36m-[39m Guangchuang Yu, Fei Li, Yide Qin, Xiaochen Bo, Yibo Wu, Shengqi Wang. GOSemSim: an R package for measuring semantic similarity among GO terms and gene products Bioinformatics 2010, 26(7):976-978. doi:10.1093/bioinformatics/btq064
library(biomaRt)
library(UpSetR)
library(VennDiagram)
## Warning: package 'VennDiagram' was built under R version 4.1.3
## Loading required package: grid
## Loading required package: futile.logger
library(ggrepel)
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.1.3
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:biomaRt':
##
## select
## The following object is masked from 'package:AnnotationDbi':
##
## select
## The following object is masked from 'package:Biobase':
##
## combine
## The following objects are masked from 'package:GenomicRanges':
##
## intersect, setdiff, union
## The following object is masked from 'package:GenomeInfoDb':
##
## intersect
## The following objects are masked from 'package:IRanges':
##
## collapse, desc, intersect, setdiff, slice, union
## The following objects are masked from 'package:S4Vectors':
##
## first, intersect, rename, setdiff, setequal, union
## The following objects are masked from 'package:BiocGenerics':
##
## combine, intersect, setdiff, union
## The following object is masked from 'package:matrixStats':
##
## count
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
library(forcats)
library(stats)
setwd('~/R')
This next section of code is dedicated to the environmental parameters. Environmental parameters are a series of variables and other code that will help make the rest of the script be easier to make and run later on.
A working directory is a code that iterates a file path on your computer th.t sets where the default location of any files that you read into R. Working directories work different in R files than R Markdowns. R Markdown files require directories to be defined at the end of each code chunk. Meaning from here on out you will see working directories being defined at the end of each code chunk.
setwd('~/R')
This chunk defines color palette variables that are going to be used in plots later on the script. These variables are defined by conversiting BrewerCode palettes into palettes that can be used in R.
viralPalette <- brewer.pal(8, "Set1")
hbvColor <- viralPalette[1]
hcvColor <- viralPalette[2]
bothColor <- viralPalette[3]
neitherColor <- viralPalette[4]
sexTissuePalette <- brewer.pal(12, "Paired")
maleTumorColor <- sexTissuePalette[4]
maleAdjacentColor <- sexTissuePalette[3]
femaleTumorColor <- sexTissuePalette[6]
femaleAdjacentColor <- sexTissuePalette[5]
setwd('~/R')
This code is where you read in all the data files that are going to be used in the script. The data is also converted into a variety of variables that makes the data easier to handle. The data is also cleaned up to make sure the analysis done later is accurate and precise.
metadata <- read.table("metadata_for_de.csv", row.names=1,header=TRUE, sep=",") #changing the name of the file
tumorAdjacentExp <- read.table("japan_all_samples_salmon_expression_counts.txt", row.names = 1, header=TRUE) #changing the name of the file
colnames(tumorAdjacentExp) <- gsub("\\.", "-", colnames(tumorAdjacentExp)) #changing the column names
This next code chunk is very similar However, it does calculate gene length which is the done by first defining a variable named “gene”” and then changing the data type to a data frame. You then redefine “tumorAdjacentExp” (defined above) to have the rows of the previous “tumorAdjacentExp” and then the columns of “GENEID”” that lies within “gene”.
Gene length is then defined to have “with” of genes in the rows and ‘end-start’ as a column
genes <- read.table("gencodeTranscripts.txt", header=TRUE, sep="\t")
genes <- data.frame(genes)
tumorAdjacentExp <- tumorAdjacentExp[rownames(tumorAdjacentExp) %in% genes$GENEID ,]
genes <- genes[match(rownames(tumorAdjacentExp), genes$GENEID),]
# Calculating gene length, this is needed for calculating the FPKM values
genes$length <- with(genes, end - start)
The next line shows a sample being removed due to low quality.
metadata<-metadata[!(metadata$ID == "RK023"), ]
This next chunk of data is dedicated to sub-setting and organizing the data to make it easier to use going forward. Sub-setting means that the data is being organized to match a count matrix. In this specific case the count matrix is the sample ID attached to the tumors.
tumorAdjacentExpSubset <- tumorAdjacentExp[,colnames(tumorAdjacentExp) %in% metadata$sampleid]
metadataSubset <- metadata[metadata$sampleid %in% colnames(tumorAdjacentExpSubset),]
metadataSubset <- metadataSubset[match(colnames(tumorAdjacentExpSubset), metadataSubset$sampleid),]
rownames(metadataSubset) <- metadataSubset$sampleid
This next chunk of data is taking the meta data and subsetting it in such a way that converts a series of categorical variables into factors. This data also adds a tissue type.
metadataSubset$tumor <- as.numeric(grepl('tumor', metadataSubset$sampleid, ignore.case=T))
metadataSubset$gender_tissue <- paste(metadataSubset$Gender, metadataSubset$tumor, sep="_")
metadataSubset$gender_tissue_viral <- paste(metadataSubset$gender_tissue, metadataSubset$Virus_infection, sep="_")
metadataSubset$library_type <- metadataSubset$strandedness
metadataSubset$library_type <- factor(metadataSubset$library_type)
metadataSubset$tumor <- factor(metadataSubset$tumor)
metadataSubset$Ta <- factor(metadataSubset$Ta)
metadataSubset$Portal_vein_invasion <- factor(metadataSubset$Portal_vein_invasion)
metadataSubset$Hepatic_vein_invasion <- factor(metadataSubset$Hepatic_vein_invasion)
metadataSubset$Bile_duct_invasion <- factor(metadataSubset$Bile_duct_invasion)
metadataSubset$Liver_fibrosisc <- factor(metadataSubset$Liver_fibrosisc)
metadataSubset$Prognosis <- factor(metadataSubset$Prognosis)
This next chunk of code creates something called a DGEList Object. This object contains a dataset that is to be analyzed later in the script. Specifically the object contains:
1. Counts– numeric matrix containing read counts
2. group– vector giving the experimental conditiona for each sample
3. genes– data frame information for the genes for which we have count data
4. remove.zeros– whether to remove rows that have 0 total count
The last line of code takes the amount of samples and places them into a table for easy read and inspection.
dge <- DGEList(counts=tumorAdjacentExpSubset, genes=genes)
colnames(dge) <- colnames(tumorAdjacentExpSubset)
dge$samples$sex <- metadataSubset$Gender
dge$samples$viral <- factor(metadataSubset$Virus_infection)
dge$samples$ID <- metadataSubset$ID
dge$samples$tumor <- metadataSubset$tumor
dge$samples$gender_tissue <- metadataSubset$gender_tissue
dge$samples$gender_tissue_viral <- metadataSubset$gender_tissue_viral
dge$samples$library_type <- metadataSubset$library_type
dge$samples$edmonson_grade <- metadataSubset$Edmondson_grade
dge$samples$Ta <- metadataSubset$Ta
dge$samples$survival <- metadataSubset$Overall_survival_month
##======================
#Inspecting the N of samples in each group
table(dge$samples$gender_tissue_viral)
##
## F_0_HBV F_0_HCV F_0_NBNC F_1_HBV F_1_HCV F_1_NBNC M_0_both M_0_HBV
## 8 34 3 9 36 3 4 33
## M_0_HCV M_0_NBNC M_1_both M_1_HBV M_1_HCV M_1_NBNC
## 59 25 4 40 71 25
This chunk of code takes the fpkm of all the genes in the dataset and calculates the mean. They also filters out genes that have a fpkm of 0.5 or lower.
Fpkm stands for fragments per kilo base of exon per million this term is interchangable with Rpkm (reads per kilobase of exon per million. This measure is a normalization method which allows us to compare gene expression levels. This is done by rescaling both library size and gene length.
Fpkm is calculated by multiplying the number of reads mapped to a gene by 1,000 and then 1,000,000 and then dividing that number by the total number of mapped reads to gene length in base pairs.
Please note that that calculation is done for RPKM which is analogous to Fpkm.
# ======================================
# Filtering expression data
# ======================================
# Keeping genes that have a mean FPKM of at least 0.5 in at least one of the
# groups under investigation and at least 6 reads in at least 10 samples
fpkm <- rpkm(dge, gene.length=dge$genes$length)
Here the fpkm is calculated from all the various tissue samples and then are filtered for greater than 0.5 and put into a variable named “keep”. This established a cutoff point that is going to be used in limma/voom analysis
M_1_HBV_mean_fpkm <- apply(as.data.frame(fpkm)[(dge$samples$gender_tissue_viral=="M_1_HBV")],1,mean,na.rm=TRUE)
M_0_HBV_mean_fpkm <- apply(as.data.frame(fpkm)[(dge$samples$gender_tissue_viral=="M_0_HBV")],1,mean,na.rm=TRUE)
M_1_HCV_mean_fpkm <- apply(as.data.frame(fpkm)[(dge$samples$gender_tissue_viral=="M_1_HCV")],1,mean,na.rm=TRUE)
M_0_HCV_mean_fpkm <- apply(as.data.frame(fpkm)[(dge$samples$gender_tissue_viral=="M_0_HCV")],1,mean,na.rm=TRUE)
M_1_HBVHCV_mean_fpkm <- apply(as.data.frame(fpkm)[(dge$samples$gender_tissue_viral=="M_1_both")],1,mean,na.rm=TRUE)
M_0_HBVHCV_mean_fpkm <- apply(as.data.frame(fpkm)[(dge$samples$gender_tissue_viral=="M_0_both")],1,mean,na.rm=TRUE)
M_1_NBNC_mean_fpkm <- apply(as.data.frame(fpkm)[(dge$samples$gender_tissue_viral=="M_1_NBNC")],1,mean,na.rm=TRUE)
M_0_NBNC_mean_fpkm <- apply(as.data.frame(fpkm)[(dge$samples$gender_tissue_viral=="M_0_NBNC")],1,mean,na.rm=TRUE)
F_1_HBV_mean_fpkm <- apply(as.data.frame(fpkm)[(dge$samples$gender_tissue_viral=="F_1_HBV")],1,mean,na.rm=TRUE)
F_0_HBV_mean_fpkm <- apply(as.data.frame(fpkm)[(dge$samples$gender_tissue_viral=="F_0_HBV")],1,mean,na.rm=TRUE)
F_1_HCV_mean_fpkm <- apply(as.data.frame(fpkm)[(dge$samples$gender_tissue_viral=="F_1_HCV")],1,mean,na.rm=TRUE)
F_0_HCV_mean_fpkm <- apply(as.data.frame(fpkm)[(dge$samples$gender_tissue_viral=="F_0_HCV")],1,mean,na.rm=TRUE)
F_1_NBNC_mean_fpkm <- apply(as.data.frame(fpkm)[(dge$samples$gender_tissue_viral=="F_1_NBNC")],1,mean,na.rm=TRUE)
F_0_NBNC_mean_fpkm <- apply(as.data.frame(fpkm)[(dge$samples$gender_tissue_viral=="F_0_NBNC")],1,mean,na.rm=TRUE)
keep <- (M_1_HBV_mean_fpkm > 0.5 | M_0_HBV_mean_fpkm > 0.5 |
M_1_HCV_mean_fpkm > 0.5 | M_0_HCV_mean_fpkm > 0.5 |
M_1_HBVHCV_mean_fpkm > 0.5 | M_0_HBVHCV_mean_fpkm > 0.5 |
M_1_NBNC_mean_fpkm > 0.5 | M_0_NBNC_mean_fpkm > 0.5 |
F_1_HBV_mean_fpkm > 0.5 | F_0_HBV_mean_fpkm > 0.5 |
F_1_HCV_mean_fpkm > 0.5 | F_0_HCV_mean_fpkm > 0.5 |
F_1_NBNC_mean_fpkm > 0.5 | F_0_NBNC_mean_fpkm > 0.5)
This chunk is further organizing and counting the libraries to be more tangible for later on.
This chunk is also calculating the normalization factors (not normalizing the data) to use later on in the limma/voom DEG.
The normalization factors are calculated using Trimmed Mean of M-values (TMM). TMM is a between sample normalization that assumes that most genes are not differentially expressed. TMM normalizes the total RNA output among the samples, not considering gene length nor library size. TMM does consider RNA population which makes it effective with samples that have diverse RNA repertoires.
TMM is calculated by taking the library size normalized read count for each gene in each sample and calculates the log2 fold change between two samples (M-value). From there you calculate the absolute expression count (A values) which is the sum of the log2 fold change of treated sample count plus the log2 fold change of the control sample count divided by two.
You then double trim the upper and lower percentages of the data. Trimming the M-values by 30% and the A values by 5%. You then get the weight mean M after tramming and calculate the normalization factor.
dge <- dge[keep,,keep.lib.sizes=FALSE]
dge <- calcNormFactors(dge, method="TMM")
keep <- rowSums(dge$counts > 6) >= 10
dge <- dge[keep,,keep.lib.size=FALSE]
dge <- calcNormFactors(dge, method="TMM")
This code counts the number of genes that make it past the cutoff point.
# N of genes retained after filtering
dim(dge$genes)
## [1] 13384 7
This code chunk is creating a design matrix to analyze tumor vs. tumor adjacent regardless of sex. This design matrix creates a model matrix that takes the inputs that we want to consider for later limma/voom analysis.
In this specific design matrix we are considering all tumor samples and all tumor adjacent samples.
The colnames code is just renaming columns of the design matrix to identify what they are for easier readability.
# ===========================================================
# ===========================================================
# Analysis of all tumor vs. tumor-adjacent regardless of sex
# ===========================================================
# ===========================================================
# Creating a new design model matrix with the variable of interest and the
# library type
design <- model.matrix(~0+dge$samples$tumor+dge$samples$library_type+dge$samples$Ta)
#I just put design matrix in twice because I didn't remember what inputs to put in.
identical(design,design,num.eq=TRUE)
## [1] TRUE
colnames(design) <- gsub("dge\\$samples\\$tumor", "tumor", colnames(design))
colnames(design) <- gsub("dge\\$samples\\$library_typeunstranded", "library_type", colnames(design))
colnames(design) <- gsub("dge\\$samples\\$Ta2", "Ta2", colnames(design))
colnames(design) <- gsub("dge\\$samples\\$Ta3", "Ta3", colnames(design))
colnames(design) <- gsub("dge\\$samples\\$Ta4", "Ta4", colnames(design))
head(design)
## tumor0 tumor1 library_type Ta2 Ta3 Ta4
## 1 1 0 1 1 0 0
## 2 0 1 1 1 0 0
## 3 1 0 1 0 0 1
## 4 0 1 1 0 0 1
## 5 1 0 1 1 0 0
## 6 0 1 1 1 0 0
voom is a function that lies within a package called limma. limma/voom is used in DGE analysis. voom is a function that takes the counts in a metadata set and transforms them into log2 of TMM values calculated above in the normalization factors. A linear model is then fitted to the TMM for each gene and residuals are calculated. A smoothed curve is then fitted to the square root of the residual standard deviation by the average expression (this is the red line). This smooth curve is then used to obtain weights for each gene and sample that are passed into limma along with the TMM values.
# Running voom again with the new design matrix.
v <- voomWithQualityWeights(dge, design, plot=TRUE)
This graph is an MDS plot of female, and male tumor and tumor-adjacent samples sorted by tissue type. An MDS plot is just a two dimensional space to arrange points. However, the points are arranged in a way so that the distances between the points correlate with dissimilarity between two samples.
#======================================
#Multi-dimensional scaling plot
#======================================
#MDS plot. The dimensions are in the mds object.
#pdf("~/3.0 Hasting Research/Liver Cancer Analysis/DEG FIgures/MDS_figure_comp1_2.pdf")
mds <- plotMDS(v, top = 50, dim.plot = c(1,2), plot=TRUE, cex=2,
pch=ifelse(v$targets$viral %in% c("HBV"), 17,
ifelse(v$targets$viral %in% c("HCV"), 15,
ifelse(v$targets$viral %in% c("both"), 16, 3))),
col=ifelse(v$targets$gender_tissue=="M_1", maleTumorColor,
ifelse(v$targets$gender_tissue=="M_0", maleAdjacentColor,
ifelse(v$targets$gender_tissue=="F_1", femaleTumorColor,
ifelse(v$targets$gender_tissue=="F_0", femaleAdjacentColor, "azure3")))),
gene.selection = "common") #, gene.selection = "pairwise", labels=tissue,
legend("topleft", pch=c(15),
col=c(maleTumorColor, maleAdjacentColor, femaleTumorColor, femaleAdjacentColor),
legend=c("Male tumor", "Male adjacent", "Female tumor", "Female adjacent"))
legend("topright", pch=c(17, 15, 16, 3),
col=c("black"),
legend=c("HBV", "HCV", "Both", "Neither"))
#dev.off()
This graph is an MDS plot of female and male tumor and tumor-adjacent samples sorted by library type.
#pdf("~/3.0 Hasting Research/Liver Cancer Analysis/DEG FIgures/MDS_figure_comp2_3_colored_by_lib_type.pdf")
mds <- plotMDS(v, top = 50, dim.plot = c(1,2), plot=TRUE, cex=2,
pch=ifelse(v$targets$viral %in% c("HBV"), 17,
ifelse(v$targets$viral %in% c("HCV"), 15,
ifelse(v$targets$viral %in% c("both"), 16, 3))),
col=ifelse(v$targets$library_type=="stranded", "red",
ifelse(v$targets$library_type=="unstranded", "blue", "black")),
gene.selection = "common")
legend("topleft", pch=c(15),
col=c(maleTumorColor, maleAdjacentColor, femaleTumorColor, femaleAdjacentColor),
legend=c("Male tumor", "Male adjacent", "Female tumor", "Female adjacent"))
legend("topright", pch=c(17, 15, 16, 3),
col=c("black"),
legend=c("HBV", "HCV", "Both", "Neither"))
#dev.off()
This graph is an MDS plot of female, male tumor samples and tumor-adjacent samples sorted by gender and tissue sample.
#pdf("~/3.0 Hasting Research/Liver Cancer Analysis/DEG FIgures/MDS_figure_comp2_3.pdf")
mds <- plotMDS(v, top = 50, dim.plot = c(2,3), plot=TRUE, cex=2,
pch=ifelse(v$targets$viral %in% c("HBV"), 17,
ifelse(v$targets$viral %in% c("HCV"), 15,
ifelse(v$targets$viral %in% c("both"), 16, 3))),
col=ifelse(v$targets$gender_tissue=="M_1", maleTumorColor,
ifelse(v$targets$gender_tissue=="M_0", maleAdjacentColor,
ifelse(v$targets$gender_tissue=="F_1", femaleTumorColor,
ifelse(v$targets$gender_tissue=="F_0", femaleAdjacentColor, "azure3")))),
gene.selection = "common") #, gene.selection = "pairwise", labels=tissue,
legend("topleft", pch=c(15),
col=c(maleTumorColor, maleAdjacentColor, femaleTumorColor, femaleAdjacentColor),
legend=c("Male tumor", "Male adjacent", "Female tumor", "Female adjacent"))
legend("topright", pch=c(17, 15, 16, 3),
col=c("black"),
legend=c("HBV", "HCV", "Both", "Neither"))
#dev.off()
This chunk of code is removing a batch effect. Batch effects are non-biological effects in an experiment that can changes in the data produced.
The batch effect being removed in this case is the library type. Some of the data was single stranded while others were double stranded causing a wide dissimilarity in the data and MDS plot.
# Removing batch effects
vCorrectLibtype <- removeBatchEffect(v, batch=v$targets$library_type)
This is the MDS plot without the batch effect in the dataset.
#pdf("~/3.0 Hasting Research/Liver Cancer Analysis/DEG FIgures/MDS_figure_comp1_2_afteradj.pdf")
mds <- plotMDS(vCorrectLibtype, top = 50, ndim=10, dim.plot=c(1,2), plot=TRUE, cex=2,
pch=ifelse(v$targets$viral %in% c("HBV"), 17,
ifelse(v$targets$viral %in% c("HCV"), 15,
ifelse(v$targets$viral %in% c("both"), 16, 3))),
col=ifelse(v$targets$gender_tissue=="M_1",
maleTumorColor,
ifelse(v$targets$gender_tissue=="M_0",
maleAdjacentColor,
ifelse(v$targets$gender_tissue=="F_1",
femaleTumorColor, femaleAdjacentColor))),
gene.selection = "common")
## Warning in plot.window(...): "ndim" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "ndim" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "ndim" is not a
## graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "ndim" is not a
## graphical parameter
## Warning in box(...): "ndim" is not a graphical parameter
## Warning in title(...): "ndim" is not a graphical parameter
legend("topleft", pch=c(15),
col=c(maleTumorColor, maleAdjacentColor, femaleTumorColor, femaleAdjacentColor),
legend=c("Male tumor", "Male adjacent", "Female tumor", "Female adjacent"))
legend("topright", pch=c(17, 15, 16, 3),
col=c("black"),
legend=c("HBV", "HCV", "Both", "Neither"))
#dev.off()
This next code chunk is creating variables needed to do a Principal Component Analysis a.k.a PCA. PCA is a dimensionality-reduction method that increases the interpretability of a large dataset while minimizing the amount of information loss.
In order to do PCA you need to create a covariance matrix. A covariance matrix is a symmetric matrix where the dimensions are the number of variables in your data set. The entries of the matrix are the covariances associated will all possible pairs of initial variables. This code chunk is creating a covariance matrix with the voom transformed counts.
#======================================
#PCA
#======================================
# Select most variable genes based on coefficient of variance (mean scaled)
# Voom transformed counts
voomCounts <- v$E
voomCountsMatrix <- data.matrix(voomCounts, rownames.force = NA)
ntop = length(dge$genes$TXNAME)
means <- rowMeans(voomCountsMatrix)
Pvars <- rowVars(voomCountsMatrix)
cv2 <- Pvars/means^2
select <- order(cv2, decreasing = TRUE)[seq_len(min(ntop, length(cv2)))]
head(select)
## [1] 13325 12625 11994 11067 11196 11554
highly_variable_exp <- ((voomCountsMatrix)[select, ])
dim(highly_variable_exp)
## [1] 13384 354
In order to do PCA you need to standardize your original dataset (seen in lines 388-390). To standardize a dataset you take a value and subtract the mean of the dataset and then divide by the standard deviation of the dataset.
This chunk is actually running the PCA which means that after creating the covariance matrix you find the eigenvectors and the eigenvalues of the matrix. The eigenvectors become your Principal Components and the eigenvalues are the variance of the principal components. These principal components are then reoriented from the original axes to the principal component axes.
# Running PCA
pca_exp <- prcomp(t(highly_variable_exp), scale=T, center=T)
head(pca_exp$t)
## NULL
This next code chunk is creating a new data frame with the first ten Principal Components. This dataframe is called a feature vector. This feature vector is important because PCA allows us to increase impretability of big datasets by reducing dimensionality of large matrices while also minimizing the amount of information loss.
# Dataframe with the first 10 PCs
dim1_10 <- data.frame(pca_exp$x[,1:10])
This next code chunk is a second PCA being done with the metadata
# Adding metadata
pcaWithMetadata <- merge(dim1_10, metadataSubset, by=0, all=TRUE)
pcaWithMetadata$Virus.infection <- factor(pcaWithMetadata$Virus_infection,
levels=c("HBV", "HCV", "both", "NBNC", NA))
pcaWithMetadata$gender_tissue <- factor(pcaWithMetadata$gender_tissue,
levels=c("M_1", "M_0", "F_1","F_0"))
This code chunk the PCA data is being plotted. The axes are two of the Principal components and then the data in the PCA done with the metadata This plot shows a much closer correlation between tumor and tumor-adjacent data.
# Plotting
#pdf("~/3.0 Hasting Research/Liver Cancer Analysis/DEG FIgures/PCA_with_metadata.pdf", width=10, height=10)
ggplot(data=pcaWithMetadata, aes(x = PC1, y = PC2, shape = Virus.infection,
color = gender_tissue)) +
geom_point(size = 8) +
theme_bw() +
scale_color_manual(values=c(maleTumorColor, maleAdjacentColor,
femaleTumorColor, femaleAdjacentColor,
"azure3")) +
theme(plot.title = element_text(size = 12, face = "bold"),
legend.title=element_text(size = 30),
legend.text=element_text(size = 30),
axis.text.x = element_text(size = 30),
axis.text.y = element_text(size = 30),
axis.title.x = element_text(size = 22),
axis.title.y = element_text(size = 22)) +
guides(color = guide_legend(order = 2),
shape = guide_legend(order = 1)) +
theme(legend.title = element_blank()) +
xlab("PC1") +
ylab("PC2")
#dev.off()
This next code chunk does a variance partition to show various to show how certain effects such has alcohol intake and smoking contribute to the results seen in the data. This chunk specifically specifies what variables to consider when doing the variance partition.
# ======================================
# Variance explained
# ======================================
# Variance explained by variancePartition
metadataSubset$Alcohol_intake <- as.factor(metadataSubset$Alcohol_intake)
metadataSubset$Smoking <- as.factor(metadataSubset$Smoking)
# Specifying variables to consider
form <- ~
Tumor_size_mm +
Overall_survival_month +
(1|ID) +
(1|tumor) +
(1|Gender) +
(1|Virus_infection) +
(1|Ta) +
(1|Edmondson_grade) +
(1|Portal_vein_invasion) +
(1|Hepatic_vein_invasion) +
(1|Bile_duct_invasion) +
(1|Liver_fibrosisc) +
(1|Alcohol_intake) +
(1|Smoking) +
(1|Prognosis)
This code chunk is actually does the variable partition. Subsequently the partition is fitted to a linear model and then plotted so we can see visually how the variance of each effect influences the overall data.
# Running parallel
cl <- makeCluster(6)
registerDoParallel(cl)
# Fitting the linear model
library(variancePartition)
varPart <- fitExtractVarPartModel(voomCounts, form, metadataSubset)
## Warning in .fitExtractVarPartModel(exprObj, formula, data, REML = REML, : Variables contain NA's: Edmondson_grade
## Samples with missing data will be dropped.
## Dividing work into 102 chunks...
##
## Total:3304 s
# Sorting variables by median fraction of variance explained
vp <- sortCols(varPart)
# Violin plot of contribution of each variable to total variance
#pdf("~/3.0 Hasting Research/Liver Cancer Analysis/DEG FIgures/VarPart.pdf", width=10, height=10)
plotVarPart(vp)
#dev.off()
# Stop parallel
registerDoSEQ()
stopCluster(cl)
This code chunk is running a Wilcoxon test to see if male and female grades should be added as cofactors. A Wilcoxon test determines if two or more sets of pairs are different from another another in a statistically significant manner. If there is a statistically significant different then they should be added as potential cofactors.
# ======================================
# Checking male vs. female grade and stage to see if these should be added as cofactors
# ======================================
metadataSubset_female <- metadataSubset[which(metadataSubset$Gender=="F"),]
metadataSubset_male <- metadataSubset[which(metadataSubset$Gender=="M"),]
#metadataSubset_male$Ta <- log(as.numeric(metadataSubset_male$Ta), 10)
#metadataSubset_female$Ta <- log(as.numeric(metadataSubset_female$Ta),10)
boxplot(metadataSubset_female$Ta, metadataSubset_male$Ta)
#t.test(as.numeric(metadataSubset_female$Ta), as.numeric(metadataSubset_male$Ta), alternative = "two.sided", paired=FALSE)
wilcox.test(as.numeric(metadataSubset_female$Ta), as.numeric(metadataSubset_male$Ta))
##
## Wilcoxon rank sum test with continuity correction
##
## data: as.numeric(metadataSubset_female$Ta) and as.numeric(metadataSubset_male$Ta)
## W = 8861, p-value = 2.959e-05
## alternative hypothesis: true location shift is not equal to 0
#pdf("~/3.0 Hasting Research/Liver Cancer Analysis/Survival/Tumor_stage_boxplot.pdf")
boxplot(metadataSubset_female$Ta, metadataSubset_male$Ta, names=c("Female", "Male"))
#dev.off()
metadataSubset_female$Edmondson_grade <- gsub("1~2", "1.5", metadataSubset_female$Edmondson_grade)
metadataSubset_female$Edmondson_grade <- gsub("2~3", "2.5", metadataSubset_female$Edmondson_grade)
metadataSubset_female$Edmondson_grade <- as.numeric(metadataSubset_female$Edmondson_grade)
metadataSubset_male$Edmondson_grade <- gsub("1~2", "1.5", metadataSubset_male$Edmondson_grade)
metadataSubset_male$Edmondson_grade <- gsub("2~3", "2.5", metadataSubset_male$Edmondson_grade)
metadataSubset_male$Edmondson_grade <- as.numeric(metadataSubset_male$Edmondson_grade)
boxplot(metadataSubset_female$Edmondson_grade, metadataSubset_male$Edmondson_grade)
t.test(metadataSubset_female$Edmondson_grade, metadataSubset_male$Edmondson_grade, alternative = "two.sided", paired=FALSE)
##
## Welch Two Sample t-test
##
## data: metadataSubset_female$Edmondson_grade and metadataSubset_male$Edmondson_grade
## t = -0.61404, df = 184.69, p-value = 0.5399
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.15286858 0.08029831
## sample estimates:
## mean of x mean of y
## 2.069892 2.106178
#pdf("~/3.0 Hasting Research/Liver Cancer Analysis/Survival/Edmonson_grade_boxplot.pdf")
boxplot(metadataSubset_female$Edmondson_grade, metadataSubset_male$Edmondson_grade, names=c("Female", "Male"))
#dev.off()
This sections marks the beginning of running limma. This section is dedicated to creating a linear fit to the data, making comparisons of the fitted data, and them apply Bayes smoothing. This first code chunk involves creating a variable that has all of the duplicate correlation values on v and design. These correlation values will be used later in a linear fit.
# ============================================================================================
# Differential expression analysis with limma - all male tumor adjacent vs. non-tumor-adjacent
# ===========================================================================================
# Block design for individual. This is used in tumor-normal comparisons with
# paired samples.
corfit <- duplicateCorrelation(v, design, block = v$targets$ID)
# This should give a positive correlation value. It represents the
# correlation between measurements made on the same person.
corfit$consensus
## [1] 0.1595651
This is the linear model with limma notice that the correlation values were the duplicate correlations used earlier
# Fitting the linear model with limma.
# If using paired samples, the within-patient correlation and a block design
# for patient is used to account for pairwise samples
fit <- lmFit(v, design, block = v$targets$ID, correlation = corfit$consensus)
This code chunk involves extracting coefficients from the linear fit model and storing them in a vector for later use.
# Contrast design for differential expression
# Defining pairwise comparisons
contrasts <- makeContrasts(Adjacent_vs_Tumor = tumor1 - tumor0,
levels=colnames(design))
head(contrasts)
## Contrasts
## Levels Adjacent_vs_Tumor
## tumor0 -1
## tumor1 1
## library_type 0
## Ta2 0
## Ta3 0
## Ta4 0
# Assigning all comparisons to a vector for later
allComparisons <- colnames(contrasts)
This next code chunk reorients the linear model obtained earlier and obtains the coefficients and standard errors from the model. This step also sets us up to apply Empirical Bayes smoothing.
# Running contrast analysis
vfit <- contrasts.fit(fit, contrasts = contrasts)
# Looking at N of DEGs with adj. p <0.01 and log2FC>2
summary(decideTests(vfit, adjust.method = "BH", p.value = 0.01, lfc = 2))
## Adjacent_vs_Tumor
## Down 509
## NotSig 12717
## Up 158
This code chunk uses Empirical Bayes smoothing and then plots the final model of after doing the limma and voom analysis.
Empirical Bayes smoothing is a way to account for uncertainty. The technique uses the population in a region as a measure of confidence. Meaning that areas with low margin of error are left untouched while estimates with higher margin of error are moved closer to the global average.
# Computing differential expression based on the empirical Bayes moderation of
# the standard errors towards a common value. Robust = should the estimation of
# the empirical Bayes prior parameters be robustified against outlier sample
# variances?
veBayesFit <- eBayes(vfit, robust=TRUE)
plotSA(veBayesFit, main = "Final model: Mean-variance trend")
vTopTable <- topTable(veBayesFit, n=Inf, p.value=1, lfc=0)
DEGs <- topTable(veBayesFit, n=Inf, p.value=0.01, lfc=2)
This code chunk involves setting up the variables and plot setting necessary to create a Volcano Plot of Tumor vs. Tumor-adjacent samples.
# ===========================================
#Volcano plot of tumor vs tumor-adjacent
# ===========================================
df <- data.frame(vTopTable$adj.P.Val, vTopTable$logFC, vTopTable$chr, vTopTable$GENEID, vTopTable$gene_name)
colnames(df) <- c("adj.P.Val", "logFC", "chr", "id", "name")
dfSig <- df[(abs(df$logFC) >= 2 & df$adj.P.Val <= 0.01),]$id
dfAnons <- subset(df, chr != "chrX" & chr != "chrY" & !(id %in% dfSig))
dfAnons <- cbind(dfAnons , rep(1, nrow(dfAnons)))
colnames(dfAnons)[6] <- "Color"
dfXnons <- subset(df, chr == "chrX" & !(id %in% dfSig))
dfXnons <- cbind(dfXnons, rep(2, nrow(dfXnons)))
colnames(dfXnons)[6] <- "Color"
dfYnons <- subset(df, chr == "chrY" & !(id %in% dfSig))
dfYnons <- cbind(dfYnons, rep(3, nrow(dfYnons)))
colnames(dfYnons)[6] <- "Color"
dfA <- subset(df, chr != "chrX" & chr != "chrY" & id %in% dfSig)
dfA <- cbind(dfA, rep(4, nrow(dfA)))
colnames(dfA)[6] <- "Color"
dfX <- subset(df, chr == "chrX" & id %in% dfSig)
dfX <- cbind(dfX, rep(5, nrow(dfX)))
colnames(dfX)[6] <- "Color"
dfY <- subset(df, chr == "chrY" & id %in% dfSig)
dfY <- cbind(dfY, rep(6, nrow(dfY)))
colnames(dfY)[6] <- "Color"
dfPlot <- rbind(dfAnons, dfXnons, dfYnons, dfA, dfX, dfY)
dfPlot$Color <- as.factor(dfPlot$Color)
This code chunk constructs the plot object of Tumor vs. Tumor adjacent volcano plot and customizes the plot. Volcano plots are a type of scatter that is used to quickly identify changes in large data sets. Volcano plots combine a statistical significan level with a magnitude of the change to allow quick viual identification of data points taht display a large maginitidue change that is statistically significant.
# Constructing the plot object. The colors will not work the way they should if
# one of the groups doesn't have any genes
p <- ggplot(data = dfPlot, aes(x = logFC, y = -log10(adj.P.Val), color=Color )) +
geom_point(alpha = 0.5, size = 8) +
theme_bw() +
theme(legend.position = "none") +
xlim(c(-15, 15)) + ylim(c(0, 130)) +
scale_color_manual(values = c("azure3", "pink", "seagreen2", "black", "mediumvioletred", "springgreen")) +
labs(x=expression(log[2](FC)),
y=expression(-log[10] ~ "(FDR-adjusted " ~ italic("p") ~ "-value)")) +
theme(axis.title.x=element_text(size=12),
axis.text.x=element_text(size=10)) +
theme(axis.title.y=element_text(size=12),
axis.text.y=element_text(size=10))
p
forLabel <- subset(dfPlot, adj.P.Val<=0.01 & abs(logFC)>=2)
This next code chunk adds specific significance thresholds to the volcano plot. The significance thresholds can be seen in the dotted lines.
# Adding lines for significance thresholds
#pdf("~/3.0 Hasting Research/Liver Cancer Analysis/DEG Figures/Volcano_plot_tumor_tumoradjacent_DEGs.pdf", width=12, height=12)
p + geom_hline(yintercept = 2, colour="#000000", linetype="dashed"
) + geom_vline(xintercept = 2, colour="#000000", linetype="dashed"
) + geom_vline(xintercept = -2, colour="#000000", linetype="dashed"
) + geom_text_repel(data=forLabel, max.iter=1000, box.padding = 0.25, force=1, aes(x = logFC, y = -log10(adj.P.Val), label=name, color=Color), size=8)
## Warning: ggrepel: 666 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
#dev.off()
This code chunk shows the results of our DEG analysis. As well as retrieving specific data from a package called biomart. The combination of the DEG analysis results and the data from biomart is printed into a table.
The label “kegg-enzyme” was not valid so I took it out of the code and it seemed to run. I do not know if this changes anything. I left the original code commented out above.
# =================
# Enriched pathways
# =================
degResult_genes <- DEGs
degResult_genes$hgnc_symbol <- degResult_genes$name
ensembl <- useEnsembl(biomart="ensembl", dataset="hsapiens_gene_ensembl")
#biomart <- getBM(attributes=c('ensembl_gene_id', 'hgnc_symbol', 'entrezgene_id', 'kegg_enzyme'), filters = 'hgnc_symbol', values = degResult_genes$gene_name, mart = ensembl)
biomart <- getBM(attributes=c('ensembl_gene_id', 'hgnc_symbol', 'entrezgene_id'), filters = 'hgnc_symbol', values = degResult_genes$gene_name, mart = ensembl)
colnames(degResult_genes) <- c("chr", "start", "end", "TXNAME", "GENEID", "hgnc_symbol", "length",
"logFC", "AveExpr", "t", "P.Value", "adj.P.Val", "B")
degResult_genes_biomart <- merge(degResult_genes, biomart, by="hgnc_symbol")
head(degResult_genes_biomart)
## hgnc_symbol chr start end TXNAME GENEID
## 1 AADAT chr4 170060221 170089967 ENST00000337664.8 ENSG00000109576.13
## 2 ACADL chr2 210187938 210225491 ENST00000233710.3 ENSG00000115361.7
## 3 ACSL4 chrX 109624243 109634686 ENST00000439581.1 ENSG00000068366.19
## 4 ACSM3 chr16 20610242 20775920 ENST00000568235.5 ENSG00000005187.11
## 5 ACTG2 chr2 73892313 73919675 ENST00000438902.6 ENSG00000163017.13
## 6 ADAMTS13 chr9 133414357 133459386 ENST00000485925.5 ENSG00000160323.18
## length logFC AveExpr t P.Value adj.P.Val B
## 1 29746 -2.433965 3.735729 -28.16478 3.930566e-92 1.143624e-89 199.58803
## 2 37553 -2.196072 3.405907 -17.29793 6.185261e-49 1.059968e-47 100.24402
## 3 10443 2.257095 5.515893 14.94782 1.892335e-39 1.936316e-38 78.39181
## 4 165678 -2.236025 5.326348 -25.01522 4.829054e-80 5.620179e-78 171.71969
## 5 27362 2.014695 0.185468 11.59107 1.580118e-26 7.826907e-26 49.27393
## 6 45029 -2.772271 4.003368 -40.31656 6.484082e-134 8.678296e-130 295.68054
## ensembl_gene_id entrezgene_id
## 1 ENSG00000109576 51166
## 2 ENSG00000115361 33
## 3 ENSG00000068366 2182
## 4 ENSG00000005187 6296
## 5 ENSG00000163017 72
## 6 ENSG00000281244 11093
degResult <- DEGs
degResult$hgnc_symbol <- degResult$gene_name
degResult_biomart <- merge(degResult, biomart, by="hgnc_symbol")
degResult_biomart <- degResult_biomart[complete.cases(degResult_biomart), ]
This code shows the dot plot of a GO enrichment analysis. GO enrichment analysis uses “Gene Ontology” system of classification to put genes into predefined “bins” allows high throughput experiments where certain data can be retrieved. GO is used to see which GO terms appear more frequently then by chance.
geneList <- degResult_biomart$entrezgene_id
geneList_quant <- degResult_biomart$logFC
ego <- enrichGO(geneList, OrgDb = "org.Hs.eg.db", ont="BP", readable=TRUE)
#pdf("~/3.0 Hasting Research/Liver Cancer Analysis/GO_KEGG Figures/dotplot_all_tumor_vs_tumor_adjacent.pdf", width=12, height=12)
dotplot(ego, showCategory=30)
#dev.off()
This code chunk is setting up a second design matrix for a voom analysis of tumor vs. non-tumor differentiated by sex.
# ===========================================================
# ===========================================================
# Analysis of tumor vs. non-tumor differentiated by sex
# ===========================================================
# ===========================================================
# ======================================
# voom transformation
# ======================================
# Creating a design model matrix with the variable of interest
#Added library type and Targets to the design matrix
#design <- model.matrix(~0+dge$samples$gender_tissue_viral)
design <- model.matrix(~0+v$targets$gender_tissue_viral+v$targets$library_type+v$targets$Ta)
identical(design,design,num.eq=TRUE)
## [1] TRUE
#Added target and library type column names
colnames(design) <- gsub("v\\$targets\\$gender_tissue_viral", "", colnames(design))
colnames(design) <- gsub("v\\$targets\\$library_typeunstranded", "library_type", colnames(design))
colnames(design) <- gsub("v\\$targets\\$Ta2", "Ta2", colnames(design))
colnames(design) <- gsub("v\\$targets\\$Ta3", "Ta3", colnames(design))
colnames(design) <- gsub("v\\$targets\\$Ta4", "Ta4", colnames(design))
head(design)
## F_0_HBV F_0_HCV F_0_NBNC F_1_HBV F_1_HCV F_1_NBNC M_0_both M_0_HBV M_0_HCV
## 1 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 1
## 4 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 1
## 6 0 0 0 0 0 0 0 0 0
## M_0_NBNC M_1_both M_1_HBV M_1_HCV M_1_NBNC library_type Ta2 Ta3 Ta4
## 1 1 0 0 0 0 1 1 0 0
## 2 0 0 0 0 1 1 1 0 0
## 3 0 0 0 0 0 1 0 0 1
## 4 0 0 0 1 0 1 0 0 1
## 5 0 0 0 0 0 1 1 0 0
## 6 0 0 0 1 0 1 1 0 0
head(design)
## F_0_HBV F_0_HCV F_0_NBNC F_1_HBV F_1_HCV F_1_NBNC M_0_both M_0_HBV M_0_HCV
## 1 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 1
## 4 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 1
## 6 0 0 0 0 0 0 0 0 0
## M_0_NBNC M_1_both M_1_HBV M_1_HCV M_1_NBNC library_type Ta2 Ta3 Ta4
## 1 1 0 0 0 0 1 1 0 0
## 2 0 0 0 0 1 1 1 0 0
## 3 0 0 0 0 0 1 0 0 1
## 4 0 0 0 1 0 1 0 0 1
## 5 0 0 0 0 0 1 1 0 0
## 6 0 0 0 1 0 1 1 0 0
This code chunk is using the voom function and outputting the graph.
# Running voom with quality weights. Normalizes expression intensities so that
# the log-ratios have similar distributions across a set of samples.
# To quantile normalize, add normalize.method="quantile"
# Running parallel
v <- voomWithQualityWeights(dge, design, plot=TRUE)
This code chunk is for doing paired samples correlation tests. The value of the correlation should be positive.
# Block design for individual. This is used in tumor-normal comparisons with
# paired samples.
corfit <- duplicateCorrelation(v, design, block = v$targets$ID)
# This should give a positive correlation value. It represents the
# correlation between measurements made on the same person.
corfit$consensus
## [1] 0.1655235
This code chuck is doing another linear fit model with limma.
# Fitting the linear model with limma.
# If using paired samples, the within-patient correlation and a block design
# for patient is used to account for pairwise samples
fit <- lmFit(v, design, block = v$targets$ID, correlation = corfit$consensus)
This code chunk did not work so I chose arbitrary column headers from the design matrix for the pairwise comparisons. The previous comment is the original line of code. The code is intended to create a contrast design matrix for differential expression.
# Contrast design for differential expression
# Defining pairwise comparisons
#contrasts <- makeContrasts(Adjacent_M_vs_Tumor_M = M_1 - M_0,
#Adjacent_F_vs_Tumor_F = F_1 - F_0,
#levels=colnames(design))
#Couldn't make the pair wise comparisions would so I just picked some random column names.
contrasts <- makeContrasts(Adjacent_F_vs_Tumor_F = F_1_HBV - F_0_HBV,
Adjacement_M_vs_Tumor_M=M_1_both- M_0_both,
levels=colnames(design))
head(contrasts)
## Contrasts
## Levels Adjacent_F_vs_Tumor_F Adjacement_M_vs_Tumor_M
## F_0_HBV -1 0
## F_0_HCV 0 0
## F_0_NBNC 0 0
## F_1_HBV 1 0
## F_1_HCV 0 0
## F_1_NBNC 0 0
# Assigning all comparisons to a vector for later
allComparisons <- colnames(contrasts)
This code is running a contrast analysis using the data from the limma model.
# Running contrast analysis
vfit <- contrasts.fit(fit, contrasts = contrasts)
# Looking at N of DEGs with adj. p <0.01 and log2FC>2
summary(decideTests(vfit, adjust.method = "BH", p.value = 0.01, lfc = 2))
## Adjacent_F_vs_Tumor_F Adjacement_M_vs_Tumor_M
## Down 500 460
## NotSig 12615 12622
## Up 269 302
Bayes smoothing on data from the limma model. Empirical Bayes smoothing increases the power of the limma model.
# Computing differential expression based on the empirical Bayes moderation of
# the standard errors towards a common value. Robust = should the estimation of
# the empirical Bayes prior parameters be robustified against outlier sample
# variances?
veBayesFit <- eBayes(vfit, robust=TRUE)
plotSA(veBayesFit, main = "Final model: Mean-variance trend")
vTopTable_M <- topTable(veBayesFit, coef=1, n=Inf, p.value=1, lfc=0)
vTopTable_F <- topTable(veBayesFit, coef=2, n=Inf, p.value=1, lfc=0)
DEGs_M <- topTable(veBayesFit, coef=1, n=Inf, p.value=0.01, lfc=2)
DEGs_F <- topTable(veBayesFit, coef=2, n=Inf, p.value=0.01, lfc=2)
DEGs_F_relax_p <- topTable(veBayesFit, coef=2, n=Inf, p.value=0.1, lfc=2)
Data used to create volcano plot of tumor-tumor adjacent male sex.
# ===========================================
#Volcano plot of male tumor vs tumor-adjacent
# ===========================================
df <- data.frame(vTopTable_M$adj.P.Val, vTopTable_M$logFC, vTopTable_M$chr, vTopTable_M$GENEID, vTopTable_M$gene_name)
colnames(df) <- c("adj.P.Val", "logFC", "chr", "id", "name")
dfSig <- df[(abs(df$logFC) >= 2 & df$adj.P.Val <= 0.01),]$id
dfAnons <- subset(df, chr != "chrX" & chr != "chrY" & !(id %in% dfSig))
dfAnons <- cbind(dfAnons , rep(1, nrow(dfAnons)))
colnames(dfAnons)[6] <- "Color"
dfXnons <- subset(df, chr == "chrX" & !(id %in% dfSig))
dfXnons <- cbind(dfXnons, rep(2, nrow(dfXnons)))
colnames(dfXnons)[6] <- "Color"
dfYnons <- subset(df, chr == "chrY" & !(id %in% dfSig))
dfYnons <- cbind(dfYnons, rep(3, nrow(dfYnons)))
colnames(dfYnons)[6] <- "Color"
dfA <- subset(df, chr != "chrX" & chr != "chrY" & id %in% dfSig)
dfA <- cbind(dfA, rep(4, nrow(dfA)))
colnames(dfA)[6] <- "Color"
dfX <- subset(df, chr == "chrX" & id %in% dfSig)
dfX <- cbind(dfX, rep(5, nrow(dfX)))
colnames(dfX)[6] <- "Color"
dfY <- subset(df, chr == "chrY" & id %in% dfSig)
dfY <- cbind(dfY, rep(6, nrow(dfY)))
colnames(dfY)[6] <- "Color"
dfPlot <- rbind(dfAnons, dfXnons, dfYnons, dfA, dfX, dfY)
dfPlot$Color <- as.factor(dfPlot$Color)
Plot object for volcano plot. This volcano plot is comparing male tumor samples to male tumor adjacent samples.
# Constructing the plot object. The colors will not work the way they should if
# one of the groups doesn't have any genes
p <- ggplot(data = dfPlot, aes(x = logFC, y = -log10(adj.P.Val), color=Color )) +
geom_point(alpha = 0.5, size = 8) +
theme_bw() +
theme(legend.position = "none") +
xlim(c(-15, 15)) + ylim(c(0, 120)) +
scale_color_manual(values = c("azure3", "pink", "seagreen2", "black", "mediumvioletred", "springgreen")) +
labs(x=expression(log[2](FC)),
y=expression(-log[10] ~ "(FDR-adjusted " ~ italic("p") ~ "-value)")) +
theme(axis.title.x=element_text(size=12),
axis.text.x=element_text(size=10)) +
theme(axis.title.y=element_text(size=12),
axis.text.y=element_text(size=10))
p
forLabel <- subset(dfPlot, adj.P.Val<=0.01 & abs(logFC)>=2)
# Adding lines for significance thresholds
#pdf("~/3.0 Hasting Research/Liver Cancer Analysis/DEG Figures/Volcano_plot_male_DEGs.pdf", width=12, height=12)
p + geom_hline(yintercept = 2, colour="#000000", linetype="dashed"
) + geom_vline(xintercept = 2, colour="#000000", linetype="dashed"
) + geom_vline(xintercept = -2, colour="#000000", linetype="dashed"
) + geom_text_repel(data=forLabel, max.iter=1000, box.padding = 0.25, force=1, aes(x = logFC, y = -log10(adj.P.Val), label=name, color=Color), size=8)
## Warning: ggrepel: 768 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
#dev.off()
This code chunk is developing the plot object for a volcano plot of female tumor vs. tumor adjacent samples.
# ===========================================
#Volcano plot of female tumor vs tumor-adjacent
# ===========================================
df <- data.frame(vTopTable_F$adj.P.Val, vTopTable_F$logFC, vTopTable_F$chr, vTopTable_F$GENEID, vTopTable_F$gene_name)
colnames(df) <- c("adj.P.Val", "logFC", "chr", "id", "name")
dfSig <- df[(abs(df$logFC) >= 2 & df$adj.P.Val <= 0.01),]$id
dfAnons <- subset(df, chr != "chrX" & chr != "chrY" & !(id %in% dfSig))
dfAnons <- cbind(dfAnons , rep(1, nrow(dfAnons)))
colnames(dfAnons)[6] <- "Color"
dfXnons <- subset(df, chr == "chrX" & !(id %in% dfSig))
dfXnons <- cbind(dfXnons, rep(2, nrow(dfXnons)))
colnames(dfXnons)[6] <- "Color"
dfYnons <- subset(df, chr == "chrY" & !(id %in% dfSig))
dfYnons <- cbind(dfYnons, rep(3, nrow(dfYnons)))
colnames(dfYnons)[6] <- "Color"
dfA <- subset(df, chr != "chrX" & chr != "chrY" & id %in% dfSig)
dfA <- cbind(dfA, rep(4, nrow(dfA)))
colnames(dfA)[6] <- "Color"
dfX <- subset(df, chr == "chrX" & id %in% dfSig)
dfX <- cbind(dfX, rep(5, nrow(dfX)))
colnames(dfX)[6] <- "Color"
dfY <- subset(df, chr == "chrY" & id %in% dfSig)
dfY <- cbind(dfY, rep(6, nrow(dfY)))
colnames(dfY)[6] <- "Color"
dfPlot <- rbind(dfAnons, dfXnons, dfYnons, dfA, dfX, dfY)
dfPlot$Color <- as.factor(dfPlot$Color)
This code chunk plots the female tumor vs. tumor adjacent volcano plot.
p <- ggplot(data = dfPlot, aes(x = logFC, y = -log10(adj.P.Val), color=Color )) +
geom_point(alpha = 0.5, size = 8) +
theme_bw() +
theme(legend.position = "none") +
xlim(c(-15, 15)) + ylim(c(0, 50)) +
scale_color_manual(values = c("azure3", "pink", "seagreen2", "black", "mediumvioletred", "springgreen")) +
labs(x=expression(log[2](FC)),
y=expression(-log[10] ~ "(FDR-adjusted " ~ italic("p") ~ "-value)")) +
theme(axis.title.x=element_text(size=12),
axis.text.x=element_text(size=10)) +
theme(axis.title.y=element_text(size=12),
axis.text.y=element_text(size=10))
p
forLabel <- subset(dfPlot, adj.P.Val<=0.01 & abs(logFC)>=2)
This code chunk is setting the significance thresholds for the volcano plot. The thresholds can be seen via dotted lines on the graph.
# Adding lines for significance thresholds
#pdf("~/3.0 Hasting Research/Liver Cancer Analysis/DEG Figures/Volcano_plot_female_DEGs.pdf", width=12, height=12)
p + geom_hline(yintercept = 2, colour="#000000", linetype="dashed"
) + geom_vline(xintercept = 2, colour="#000000", linetype="dashed"
) + geom_vline(xintercept = -2, colour="#000000", linetype="dashed"
) + geom_text_repel(data=forLabel, max.iter=1000, box.padding = 0.25, force=1, aes(x = logFC, y = -log10(adj.P.Val), label=name, color=Color), size=8)
## Warning: ggrepel: 760 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
#dev.off()
In this code chunk two Venn Diagrams are made. The commented out sections are the original code sections. The first two sections are creating the Venn Diagrams. Given that I have a different working directory and am doing this is an RMD file not an R file. I rewrote the code to create the Venn Diagrams and output them into the RMD file which you see below.
The third section is inputting data from a CSV file. I do not have the said CSV files so I have commented these sections out. These samples show the overlap between male and female samples.
library(VennDiagram)
#venn.diagram(List("Female"=DEGs_F$gene_name, "Male"=DEGs_M$gene_name), filename = "~/3.0 Hasting Research/Liver Cancer Analysis/DEG Figures/Overlap_DEGs.png")
#venn.diagram(List("Female"=DEGs_F_relax_p$gene_name, "Male"=DEGs_M$gene_name), filename = "~/3.0 Hasting Research/Liver Cancer Analysis/DEG Figures/Overlap_DEGs_Frelaxp.png")
#write.csv(DEGs_F, "~/3.0 Hasting Research/Liver Cancer Analysis/DEG Figures/Female_all_DEGs.csv")
#write.csv(DEGs_M, "~/3.0 Hasting Research/Liver Cancer Analysis/DEG Figures/Male_all_DEGs.csv")
write.csv(DEGs_F, "~/R/Female_DEGs.csv")
write.csv(DEGs_M, "~/R/Male_all_DEGs.csv")
venn1<- venn.diagram(List("Female"=DEGs_F$gene_name, "Male"=DEGs_M$gene_name),filename = NULL)
grid.newpage()
grid.draw(venn1)
venn2<-venn.diagram(List("Female"=DEGs_F_relax_p$gene_name, "Male"=DEGs_M$gene_name), filename = NULL)
grid.newpage()
grid.draw(venn2)
The following three sections are DEG results taken from a couple of files that I do not have making these sections not work. Hence, they are all commented out.
# ===================================
# Enriched pathways males vs. females
# ===================================
#path <- "~/3.0 Hasting Research/Liver Cancer Analysis/DEGs/DEGs_newcounts_Salmon_batchMD1cov_fpkm05_HBVAdjacent_M_vs_HBVAdjacent_F_fdr1_lfc0.txt"
#Do not have appropriate files using old files for this
degResult_genes <- DEGs
degResult_genes$hgnc_symbol <- degResult_genes$name
ensembl <- useEnsembl(biomart="ensembl", dataset="hsapiens_gene_ensembl")
#biomart <- getBM(attributes=c('ensembl_gene_id', 'hgnc_symbol', 'entrezgene_id', 'kegg_enzyme'), filters = 'hgnc_symbol', values = degResult_genes$gene_name, mart = ensembl)
#Deleted "kegg_enzyme" label beccause it does not work.
biomart <- getBM(attributes=c('ensembl_gene_id', 'hgnc_symbol', 'entrezgene_id'), filters = 'hgnc_symbol', values = degResult_genes$gene_name, mart = ensembl)
colnames(degResult_genes) <- c("chr", "start", "end", "TXNAME", "GENEID", "hgnc_symbol", "length", "logFC", "AveExpr", "t", "P.Value", "adj.P.Val", "B")
degResult_genes_biomart <- merge(degResult_genes, biomart, by="hgnc_symbol")
head(degResult_genes_biomart)
## hgnc_symbol chr start end TXNAME GENEID
## 1 AADAT chr4 170060221 170089967 ENST00000337664.8 ENSG00000109576.13
## 2 ACADL chr2 210187938 210225491 ENST00000233710.3 ENSG00000115361.7
## 3 ACSL4 chrX 109624243 109634686 ENST00000439581.1 ENSG00000068366.19
## 4 ACSM3 chr16 20610242 20775920 ENST00000568235.5 ENSG00000005187.11
## 5 ACTG2 chr2 73892313 73919675 ENST00000438902.6 ENSG00000163017.13
## 6 ADAMTS13 chr9 133414357 133459386 ENST00000485925.5 ENSG00000160323.18
## length logFC AveExpr t P.Value adj.P.Val B
## 1 29746 -2.433965 3.735729 -28.16478 3.930566e-92 1.143624e-89 199.58803
## 2 37553 -2.196072 3.405907 -17.29793 6.185261e-49 1.059968e-47 100.24402
## 3 10443 2.257095 5.515893 14.94782 1.892335e-39 1.936316e-38 78.39181
## 4 165678 -2.236025 5.326348 -25.01522 4.829054e-80 5.620179e-78 171.71969
## 5 27362 2.014695 0.185468 11.59107 1.580118e-26 7.826907e-26 49.27393
## 6 45029 -2.772271 4.003368 -40.31656 6.484082e-134 8.678296e-130 295.68054
## ensembl_gene_id entrezgene_id
## 1 ENSG00000109576 51166
## 2 ENSG00000115361 33
## 3 ENSG00000068366 2182
## 4 ENSG00000005187 6296
## 5 ENSG00000163017 72
## 6 ENSG00000281244 11093
degResult <- DEGs_F
degResult$hgnc_symbol <- degResult$gene_name
degResult_biomart <- merge(degResult, biomart, by="hgnc_symbol")
degResult_biomart <- degResult_biomart[complete.cases(degResult_biomart), ]
geneList <- degResult_biomart$entrezgene_id
geneList_quant <- degResult_biomart$logFC
ego <- enrichGO(geneList, OrgDb = "org.Hs.eg.db", ont="BP", readable=TRUE)
#pdf("~/3.0 Hasting Research/Liver Cancer Analysis/GO_KEGG Figures/Goplot_all_female.pdf", width=12, height=12)
goplot(ego)
#dev.off()
#pdf("~/3.0 Hasting Research/Liver Cancer Analysis/GO_KEGG Figures/dotplot_all_female.pdf", width=12, height=12)
dotplot(ego, showCategory=30)
#dev.off()
degResult <- DEGs_M
degResult$hgnc_symbol <- degResult$gene_name
degResult_biomart <- merge(degResult, biomart, by="hgnc_symbol")
degResult_biomart <- degResult_biomart[complete.cases(degResult_biomart), ]
geneList <- degResult_biomart$entrezgene_id
geneList_quant <- degResult_biomart$logFC
ego <- enrichGO(geneList, OrgDb = "org.Hs.eg.db", ont="BP", readable=TRUE)
#pdf("~/3.0 Hasting Research/Liver Cancer Analysis/GO_KEGG Figures/Goplot_all_male.pdf", width=12, height=12)
goplot(ego)
#dev.off()
#pdf("~/3.0 Hasting Research/Liver Cancer Analysis/GO_KEGG Figures/dotplot_all_male.pdf", width=12, height=12)
dotplot(ego, showCategory=30)
#dev.off()
This section is creating a design matrix and running the voom function on tumor non tumor adjacent samples that are separated by etiology.
# =========================================================================================================
# Differential expression analysis with limma - tumor adjacent vs. non-tumor-adjacent seperated by etiology
# =========================================================================================================
# Creating a new design model matrix with the variable of interest and the
# first dimension of the MDS
design <- model.matrix(~0+v$targets$gender_tissue_viral+v$targets$library_type+v$targets$Ta)
identical(design,design,num.eq=TRUE)
## [1] TRUE
#I just put design matrix as inputs twice because I didn't what inputs to put
identical(design,design,num.eq=TRUE)
## [1] TRUE
colnames(design) <- gsub("v\\$targets\\$gender_tissue_viral", "", colnames(design))
colnames(design) <- gsub("v\\$targets\\$library_typeunstranded", "library_type", colnames(design))
colnames(design) <- gsub("v\\$targets\\$Ta2", "Ta2", colnames(design))
colnames(design) <- gsub("v\\$targets\\$Ta3", "Ta3", colnames(design))
colnames(design) <- gsub("v\\$targets\\$Ta4", "Ta4", colnames(design))
head(design)
## F_0_HBV F_0_HCV F_0_NBNC F_1_HBV F_1_HCV F_1_NBNC M_0_both M_0_HBV M_0_HCV
## 1 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 1
## 4 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 1
## 6 0 0 0 0 0 0 0 0 0
## M_0_NBNC M_1_both M_1_HBV M_1_HCV M_1_NBNC library_type Ta2 Ta3 Ta4
## 1 1 0 0 0 0 1 1 0 0
## 2 0 0 0 0 1 1 1 0 0
## 3 0 0 0 0 0 1 0 0 1
## 4 0 0 0 1 0 1 0 0 1
## 5 0 0 0 0 0 1 1 0 0
## 6 0 0 0 1 0 1 1 0 0
# Running voom again with the new design matrix.
v <- voomWithQualityWeights(dge, design, plot=TRUE)
This code chunk is doing a paired sample correlation test. The output should be a positive value.
# Block design for individual. This is used in tumor-normal comparisons with
# paired samples.
corfit <- duplicateCorrelation(v, design, block = v$targets$ID)
# This should give a positive correlation value. It represents the
# correlation between measurements made on the same person.
corfit$consensus
## [1] 0.1655235
This code chunk is making a linear model with the new design matrix created above.
# Fitting the linear model with limma.
# If using paired samples, the within-patient correlation and a block design
# for patient is used to account for pairwise samples
fit <- lmFit(v, design, block = v$targets$ID, correlation = corfit$consensus)
This code chunk is making a new design matrix.
# Contrast design for differential expression
# Defining pairwise comparisons
contrasts <- makeContrasts(HBVAdjacent_M_vs_HBVTumor_M = M_1_HBV - M_0_HBV,
HCVAdjacent_M_vs_HCVTumor_M = M_1_HCV - M_0_HCV,
NeitherAdjacent_M_vs_NeitherTumor_M = M_1_NBNC - M_0_NBNC,
HBVAdjacent_F_vs_HBVTumor_F = F_1_HBV - F_0_HBV,
HCVAdjacent_F_vs_HCVTumor_F = F_1_HCV - F_0_HCV,
NeitherAdjacent_F_vs_NeitherTumor_F = F_1_NBNC - F_0_NBNC,
levels=colnames(design))
head(contrasts)
## Contrasts
## Levels HBVAdjacent_M_vs_HBVTumor_M HCVAdjacent_M_vs_HCVTumor_M
## F_0_HBV 0 0
## F_0_HCV 0 0
## F_0_NBNC 0 0
## F_1_HBV 0 0
## F_1_HCV 0 0
## F_1_NBNC 0 0
## Contrasts
## Levels NeitherAdjacent_M_vs_NeitherTumor_M HBVAdjacent_F_vs_HBVTumor_F
## F_0_HBV 0 -1
## F_0_HCV 0 0
## F_0_NBNC 0 0
## F_1_HBV 0 1
## F_1_HCV 0 0
## F_1_NBNC 0 0
## Contrasts
## Levels HCVAdjacent_F_vs_HCVTumor_F NeitherAdjacent_F_vs_NeitherTumor_F
## F_0_HBV 0 0
## F_0_HCV -1 0
## F_0_NBNC 0 -1
## F_1_HBV 0 0
## F_1_HCV 1 0
## F_1_NBNC 0 1
This section is doing contrast analysis with the linear fit model. The output is put into table to show what is significant and what isn’t.
# Assigning all comparisons to a vector for later
allComparisons <- colnames(contrasts)
# Running contrast analysis
vfit <- contrasts.fit(fit, contrasts = contrasts)
# Looking at N of DEGs with adj. p <0.01 and log2FC>2
summary(decideTests(vfit, adjust.method = "BH", p.value = 0.01, lfc = 2))
## HBVAdjacent_M_vs_HBVTumor_M HCVAdjacent_M_vs_HCVTumor_M
## Down 612 578
## NotSig 12541 12666
## Up 231 140
## NeitherAdjacent_M_vs_NeitherTumor_M HBVAdjacent_F_vs_HBVTumor_F
## Down 552 500
## NotSig 12644 12615
## Up 188 269
## HCVAdjacent_F_vs_HCVTumor_F NeitherAdjacent_F_vs_NeitherTumor_F
## Down 434 142
## NotSig 12815 13109
## Up 135 133
This section is apply Bayes smoothing to the linear fit model to increase the power of the model.
# Computing differential expression based on the empirical Bayes moderation of
# the standard errors towards a common value. Robust = should the estimation of
# the empirical Bayes prior parameters be robustified against outlier sample
# variances?
veBayesFit <- eBayes(vfit, robust=TRUE)
plotSA(veBayesFit, main = "Final model: Mean-variance trend")
vTopTable_M_HBV <- topTable(veBayesFit, coef=1, n=Inf, p.value=1, lfc=0)
vTopTable_M_HCV <- topTable(veBayesFit, coef=2, n=Inf, p.value=1, lfc=0)
vTopTable_M_Neither <- topTable(veBayesFit, coef=3, n=Inf, p.value=1, lfc=0)
vTopTable_F_HBV <- topTable(veBayesFit, coef=4, n=Inf, p.value=1, lfc=0)
vTopTable_F_HCV <- topTable(veBayesFit, coef=5, n=Inf, p.value=1, lfc=0)
vTopTable_F_Neither <- topTable(veBayesFit, coef=6, n=Inf, p.value=1, lfc=0)
DEGs_M_HBV <- topTable(veBayesFit, coef=1, n=Inf, p.value=0.01, lfc=2)
DEGs_M_HCV <- topTable(veBayesFit, coef=2, n=Inf, p.value=0.01, lfc=2)
DEGs_M_Neither <- topTable(veBayesFit, coef=3, n=Inf, p.value=0.01, lfc=2)
DEGs_F_HBV <- topTable(veBayesFit, coef=4, n=Inf, p.value=0.01, lfc=2)
DEGs_F_HBV_relax_p <- topTable(veBayesFit, coef=4, n=Inf, p.value=0.1, lfc=2)
DEGs_F_HCV <- topTable(veBayesFit, coef=5, n=Inf, p.value=0.01, lfc=2)
DEGs_F_HCV_relax_p <- topTable(veBayesFit, coef=5, n=Inf, p.value=0.1, lfc=2)
DEGs_F_Neither <- topTable(veBayesFit, coef=6, n=Inf, p.value=0.01, lfc=2)
DEGs_F_Neither_relax_p <- topTable(veBayesFit, coef=6, n=Inf, p.value=0.1, lfc=2)
This section is creating the plot object fora volcano plot of male HBV tumor and tumor adjacent samples.
# ===============================================
#Volcano plot of male HBV tumor vs tumor-adjacent
# ===============================================
df <- data.frame(vTopTable_M_HBV$adj.P.Val, vTopTable_M_HBV$logFC, vTopTable_M_HBV$chr, vTopTable_M_HBV$GENEID, vTopTable_M_HBV$gene_name)
colnames(df) <- c("adj.P.Val", "logFC", "chr", "id", "name")
dfSig <- df[(abs(df$logFC) >= 2 & df$adj.P.Val <= 0.01),]$id
dfAnons <- subset(df, chr != "chrX" & chr != "chrY" & !(id %in% dfSig))
dfAnons <- cbind(dfAnons , rep(1, nrow(dfAnons)))
colnames(dfAnons)[6] <- "Color"
dfXnons <- subset(df, chr == "chrX" & !(id %in% dfSig))
dfXnons <- cbind(dfXnons, rep(2, nrow(dfXnons)))
colnames(dfXnons)[6] <- "Color"
dfYnons <- subset(df, chr == "chrY" & !(id %in% dfSig))
dfYnons <- cbind(dfYnons, rep(3, nrow(dfYnons)))
colnames(dfYnons)[6] <- "Color"
dfA <- subset(df, chr != "chrX" & chr != "chrY" & id %in% dfSig)
dfA <- cbind(dfA, rep(4, nrow(dfA)))
colnames(dfA)[6] <- "Color"
dfX <- subset(df, chr == "chrX" & id %in% dfSig)
dfX <- cbind(dfX, rep(5, nrow(dfX)))
colnames(dfX)[6] <- "Color"
dfY <- subset(df, chr == "chrY" & id %in% dfSig)
dfY <- cbind(dfY, rep(6, nrow(dfY)))
colnames(dfY)[6] <- "Color"
dfPlot <- rbind(dfAnons, dfXnons, dfYnons, dfA, dfX, dfY)
dfPlot$Color <- as.factor(dfPlot$Color)
This section plots the male HBV tumor vs. tumor adjacent samples.
# Constructing the plot object. The colors will not work the way they should if
# one of the groups doesn't have any genes
p <- ggplot(data = dfPlot, aes(x = logFC, y = -log10(adj.P.Val), color=Color )) +
geom_point(alpha = 0.5, size = 8) +
theme_bw() +
theme(legend.position = "none") +
xlim(c(-15, 15)) + ylim(c(0, 50)) +
scale_color_manual(values = c("azure3", "pink", "seagreen2", "black", "mediumvioletred", "springgreen")) +
labs(x=expression(log[2](FC)),
y=expression(-log[10] ~ "(FDR-adjusted " ~ italic("p") ~ "-value)")) +
theme(axis.title.x=element_text(size=12),
axis.text.x=element_text(size=10)) +
theme(axis.title.y=element_text(size=12),
axis.text.y=element_text(size=10))
p
forLabel <- subset(dfPlot, adj.P.Val<=0.01 & abs(logFC)>=2)
This section applies significance thresholds to the volcano plot created above. The thresholds can be seen via dotted lines on the graph.
# Adding lines for significance thresholds
#pdf("~/3.0 Hasting Research/Liver Cancer Analysis/DEG Figures/Volcano_plot_male_HBV_DEGs.pdf", width=12, height=12)
p + geom_hline(yintercept = 2, colour="#000000", linetype="dashed"
) + geom_vline(xintercept = 2, colour="#000000", linetype="dashed"
) + geom_vline(xintercept = -2, colour="#000000", linetype="dashed"
) + geom_text_repel(data=forLabel, max.iter=1000, box.padding = 0.25, force=1, aes(x = logFC, y = -log10(adj.P.Val), label=name, color=Color), size=8)
## Warning: ggrepel: 841 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
#dev.off()
This section creates a plot object and graphs a volcano plot of female HBV tumor vs. tumor adjacent samples.
# ===========================================
#Volcano plot of female HBV tumor vs tumor-adjacent
# ===========================================
df <- data.frame(vTopTable_F_HBV$adj.P.Val, vTopTable_F_HBV$logFC, vTopTable_F_HBV$chr, vTopTable_F_HBV$GENEID, vTopTable_F_HBV$gene_name)
colnames(df) <- c("adj.P.Val", "logFC", "chr", "id", "name")
dfSig <- df[(abs(df$logFC) >= 2 & df$adj.P.Val <= 0.01),]$id
dfAnons <- subset(df, chr != "chrX" & chr != "chrY" & !(id %in% dfSig))
dfAnons <- cbind(dfAnons , rep(1, nrow(dfAnons)))
colnames(dfAnons)[6] <- "Color"
dfXnons <- subset(df, chr == "chrX" & !(id %in% dfSig))
dfXnons <- cbind(dfXnons, rep(2, nrow(dfXnons)))
colnames(dfXnons)[6] <- "Color"
dfYnons <- subset(df, chr == "chrY" & !(id %in% dfSig))
dfYnons <- cbind(dfYnons, rep(3, nrow(dfYnons)))
colnames(dfYnons)[6] <- "Color"
dfA <- subset(df, chr != "chrX" & chr != "chrY" & id %in% dfSig)
dfA <- cbind(dfA, rep(4, nrow(dfA)))
colnames(dfA)[6] <- "Color"
dfX <- subset(df, chr == "chrX" & id %in% dfSig)
dfX <- cbind(dfX, rep(5, nrow(dfX)))
colnames(dfX)[6] <- "Color"
dfY <- subset(df, chr == "chrY" & id %in% dfSig)
dfY <- cbind(dfY, rep(6, nrow(dfY)))
colnames(dfY)[6] <- "Color"
dfPlot <- rbind(dfAnons, dfXnons, dfYnons, dfA, dfX, dfY)
dfPlot$Color <- as.factor(dfPlot$Color)
# Constructing the plot object. The colors will not work the way they should if
# one of the groups doesn't have any genes
p <- ggplot(data = dfPlot, aes(x = logFC, y = -log10(adj.P.Val), color=Color )) +
geom_point(alpha = 0.5, size = 8) +
theme_bw() +
theme(legend.position = "none") +
xlim(c(-15, 15)) + ylim(c(0, 30)) +
scale_color_manual(values = c("azure3", "pink", "seagreen2", "black", "mediumvioletred", "springgreen")) +
labs(x=expression(log[2](FC)),
y=expression(-log[10] ~ "(FDR-adjusted " ~ italic("p") ~ "-value)")) +
theme(axis.title.x=element_text(size=12),
axis.text.x=element_text(size=10)) +
theme(axis.title.y=element_text(size=12),
axis.text.y=element_text(size=10))
p
forLabel <- subset(dfPlot, adj.P.Val<=0.01 & abs(logFC)>=2)
This section creates significance level for the plot created above. The significance levels can be seen by dotted lines on the graph.
This section also creates Venn Diagrams that show the overlap of male and female HBV samples.
# Adding lines for significance thresholds
#pdf("~/3.0 Hasting Research/Liver Cancer Analysis/DEG Figures/Volcano_plot_female_HBV_DEGs.pdf", width=12, height=12)
p + geom_hline(yintercept = 2, colour="#000000", linetype="dashed"
) + geom_vline(xintercept = 2, colour="#000000", linetype="dashed"
) + geom_vline(xintercept = -2, colour="#000000", linetype="dashed"
) + geom_text_repel(data=forLabel, max.iter=1000, box.padding = 0.25, force=1, aes(x = logFC, y = -log10(adj.P.Val), label=name, color=Color), size=8)
## Warning: ggrepel: 767 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
#dev.off()
library(VennDiagram)
venn3<-venn.diagram(List("Female"=DEGs_F_HBV$gene_name, "Male"=DEGs_M_HBV$gene_name), filename = NULL)
grid.newpage()
grid.draw(venn3)
venn4<-venn.diagram(List("Female"=DEGs_F_HBV_relax_p$gene_name, "Male"=DEGs_M_HBV$gene_name), filename = NULL)
grid.newpage()
grid.draw(venn4)
This section creates and outputs a volcano plot of male HCV tumor and tumor adjacent samples.
# ===============================================
#Volcano plot of male HCV tumor vs tumor-adjacent
# ===============================================
df <- data.frame(vTopTable_M_HCV$adj.P.Val, vTopTable_M_HCV$logFC, vTopTable_M_HCV$chr, vTopTable_M_HCV$GENEID, vTopTable_M_HCV$gene_name)
colnames(df) <- c("adj.P.Val", "logFC", "chr", "id", "name")
dfSig <- df[(abs(df$logFC) >= 2 & df$adj.P.Val <= 0.01),]$id
dfAnons <- subset(df, chr != "chrX" & chr != "chrY" & !(id %in% dfSig))
dfAnons <- cbind(dfAnons , rep(1, nrow(dfAnons)))
colnames(dfAnons)[6] <- "Color"
dfXnons <- subset(df, chr == "chrX" & !(id %in% dfSig))
dfXnons <- cbind(dfXnons, rep(2, nrow(dfXnons)))
colnames(dfXnons)[6] <- "Color"
dfYnons <- subset(df, chr == "chrY" & !(id %in% dfSig))
dfYnons <- cbind(dfYnons, rep(3, nrow(dfYnons)))
colnames(dfYnons)[6] <- "Color"
dfA <- subset(df, chr != "chrX" & chr != "chrY" & id %in% dfSig)
dfA <- cbind(dfA, rep(4, nrow(dfA)))
colnames(dfA)[6] <- "Color"
dfX <- subset(df, chr == "chrX" & id %in% dfSig)
dfX <- cbind(dfX, rep(5, nrow(dfX)))
colnames(dfX)[6] <- "Color"
dfY <- subset(df, chr == "chrY" & id %in% dfSig)
dfY <- cbind(dfY, rep(6, nrow(dfY)))
colnames(dfY)[6] <- "Color"
dfPlot <- rbind(dfAnons, dfXnons, dfYnons, dfA, dfX, dfY)
dfPlot$Color <- as.factor(dfPlot$Color)
# Constructing the plot object. The colors will not work the way they should if
# one of the groups doesn't have any genes
p <- ggplot(data = dfPlot, aes(x = logFC, y = -log10(adj.P.Val), color=Color )) +
geom_point(alpha = 0.5, size = 8) +
theme_bw() +
theme(legend.position = "none") +
xlim(c(-15, 15)) + ylim(c(0, 70)) +
scale_color_manual(values = c("azure3", "pink", "seagreen2", "black", "mediumvioletred", "springgreen")) +
labs(x=expression(log[2](FC)),
y=expression(-log[10] ~ "(FDR-adjusted " ~ italic("p") ~ "-value)")) +
theme(axis.title.x=element_text(size=12),
axis.text.x=element_text(size=10)) +
theme(axis.title.y=element_text(size=12),
axis.text.y=element_text(size=10))
p
forLabel <- subset(dfPlot, adj.P.Val<=0.01 & abs(logFC)>=2)
This section creates the significance levels for the plot created above. The ouput can be seen in the dotted lines on the graph.
# Adding lines for significance thresholds
#pdf("~/3.0 Hasting Research/Liver Cancer Analysis/DEG Figures/Volcano_plot_male_HCV_DEGs.pdf", width=12, height=12)
p + geom_hline(yintercept = 2, colour="#000000", linetype="dashed"
) + geom_vline(xintercept = 2, colour="#000000", linetype="dashed"
) + geom_vline(xintercept = -2, colour="#000000", linetype="dashed"
) + geom_text_repel(data=forLabel, max.iter=1000, box.padding = 0.25, force=1, aes(x = logFC, y = -log10(adj.P.Val), label=name, color=Color), size=8)
## Warning: ggrepel: 713 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
#dev.off()
This section creates and ouputs a volcano plot of female HCV tumor tumor adjacent samples.
# ===========================================
#Volcano plot of female HCV tumor vs tumor-adjacent
# ===========================================
df <- data.frame(vTopTable_F_HCV$adj.P.Val, vTopTable_F_HCV$logFC, vTopTable_F_HCV$chr, vTopTable_F_HCV$GENEID, vTopTable_F_HCV$gene_name)
colnames(df) <- c("adj.P.Val", "logFC", "chr", "id", "name")
dfSig <- df[(abs(df$logFC) >= 2 & df$adj.P.Val <= 0.01),]$id
dfAnons <- subset(df, chr != "chrX" & chr != "chrY" & !(id %in% dfSig))
dfAnons <- cbind(dfAnons , rep(1, nrow(dfAnons)))
colnames(dfAnons)[6] <- "Color"
dfXnons <- subset(df, chr == "chrX" & !(id %in% dfSig))
dfXnons <- cbind(dfXnons, rep(2, nrow(dfXnons)))
colnames(dfXnons)[6] <- "Color"
dfYnons <- subset(df, chr == "chrY" & !(id %in% dfSig))
dfYnons <- cbind(dfYnons, rep(3, nrow(dfYnons)))
colnames(dfYnons)[6] <- "Color"
dfA <- subset(df, chr != "chrX" & chr != "chrY" & id %in% dfSig)
dfA <- cbind(dfA, rep(4, nrow(dfA)))
colnames(dfA)[6] <- "Color"
dfX <- subset(df, chr == "chrX" & id %in% dfSig)
dfX <- cbind(dfX, rep(5, nrow(dfX)))
colnames(dfX)[6] <- "Color"
dfY <- subset(df, chr == "chrY" & id %in% dfSig)
dfY <- cbind(dfY, rep(6, nrow(dfY)))
colnames(dfY)[6] <- "Color"
dfPlot <- rbind(dfAnons, dfXnons, dfYnons, dfA, dfX, dfY)
dfPlot$Color <- as.factor(dfPlot$Color)
# Constructing the plot object. The colors will not work the way they should if
# one of the groups doesn't have any genes
p <- ggplot(data = dfPlot, aes(x = logFC, y = -log10(adj.P.Val), color=Color )) +
geom_point(alpha = 0.5, size = 8) +
theme_bw() +
theme(legend.position = "none") +
xlim(c(-15, 15)) + ylim(c(0, 40)) +
scale_color_manual(values = c("azure3", "pink", "seagreen2", "black", "mediumvioletred", "springgreen")) +
labs(x=expression(log[2](FC)),
y=expression(-log[10] ~ "(FDR-adjusted " ~ italic("p") ~ "-value)")) +
theme(axis.title.x=element_text(size=12),
axis.text.x=element_text(size=10)) +
theme(axis.title.y=element_text(size=12),
axis.text.y=element_text(size=10))
p
forLabel <- subset(dfPlot, adj.P.Val<=0.01 & abs(logFC)>=2)
This section creates significance levels and venn diagrams. The significance levels are for hte volcano plot created above and can be seen as the dotted lines on the plot below.
The Venn Diagrams are to show the overlap in male and female HCV samples.
# Adding lines for significance thresholds
#pdf("~/3.0 Hasting Research/Liver Cancer Analysis/DEG Figures/Volcano_plot_female_HCV_DEGs.pdf", width=12, height=12)
p + geom_hline(yintercept = 2, colour="#000000", linetype="dashed"
) + geom_vline(xintercept = 2, colour="#000000", linetype="dashed"
) + geom_vline(xintercept = -2, colour="#000000", linetype="dashed"
) + geom_text_repel(data=forLabel, max.iter=1000, box.padding = 0.25, force=1, aes(x = logFC, y = -log10(adj.P.Val), label=name, color=Color), size=8)
## Warning: ggrepel: 567 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
#dev.off()
library(VennDiagram)
#venn.diagram(List("Female"=DEGs_F_HCV$gene_name, "Male"=DEGs_M_HCV$gene_name), filename = "~/3.0 Hasting Research/Liver Cancer Analysis/DEG Figures/Overlap_DEGs_HCV.png")
#venn.diagram(List("Female"=DEGs_F_HCV_relax_p$gene_name, "Male"=DEGs_M_HCV$gene_name), filename = "~/3.0 Hasting Research/Liver Cancer Analysis/DEG Figures/Overlap_DEGs_HCV_Frelaxp.png")
venn5<-venn.diagram(List("Female"=DEGs_F_HCV$gene_name, "Male"=DEGs_M_HCV$gene_name), filename=NULL)
grid.newpage()
grid.draw(venn5)
venn6<-venn.diagram(List("Female"=DEGs_F_HCV_relax_p$gene_name, "Male"=DEGs_M_HCV$gene_name), filename = NULL)
grid.newpage()
grid.draw(venn6)
This section create, plots, and adds significance thresholds for a volcano plot of male non-viral vs. tumor adjacent samples. This significance thresholds can be seen via the dotted lines on the graph.
# ===============================================
#Volcano plot of male non-viral tumor vs tumor-adjacent
# ===============================================
df <- data.frame(vTopTable_M_Neither$adj.P.Val, vTopTable_M_Neither$logFC, vTopTable_M_Neither$chr, vTopTable_M_Neither$GENEID, vTopTable_M_Neither$gene_name)
colnames(df) <- c("adj.P.Val", "logFC", "chr", "id", "name")
dfSig <- df[(abs(df$logFC) >= 2 & df$adj.P.Val <= 0.01),]$id
dfAnons <- subset(df, chr != "chrX" & chr != "chrY" & !(id %in% dfSig))
dfAnons <- cbind(dfAnons , rep(1, nrow(dfAnons)))
colnames(dfAnons)[6] <- "Color"
dfXnons <- subset(df, chr == "chrX" & !(id %in% dfSig))
dfXnons <- cbind(dfXnons, rep(2, nrow(dfXnons)))
colnames(dfXnons)[6] <- "Color"
dfYnons <- subset(df, chr == "chrY" & !(id %in% dfSig))
dfYnons <- cbind(dfYnons, rep(3, nrow(dfYnons)))
colnames(dfYnons)[6] <- "Color"
dfA <- subset(df, chr != "chrX" & chr != "chrY" & id %in% dfSig)
dfA <- cbind(dfA, rep(4, nrow(dfA)))
colnames(dfA)[6] <- "Color"
dfX <- subset(df, chr == "chrX" & id %in% dfSig)
dfX <- cbind(dfX, rep(5, nrow(dfX)))
colnames(dfX)[6] <- "Color"
dfY <- subset(df, chr == "chrY" & id %in% dfSig)
dfY <- cbind(dfY, rep(6, nrow(dfY)))
colnames(dfY)[6] <- "Color"
dfPlot <- rbind(dfAnons, dfXnons, dfYnons, dfA, dfX, dfY)
dfPlot$Color <- as.factor(dfPlot$Color)
# Constructing the plot object. The colors will not work the way they should if
# one of the groups doesn't have any genes
p <- ggplot(data = dfPlot, aes(x = logFC, y = -log10(adj.P.Val), color=Color )) +
geom_point(alpha = 0.5, size = 8) +
theme_bw() +
theme(legend.position = "none") +
xlim(c(-15, 15)) + ylim(c(0, 40)) +
scale_color_manual(values = c("azure3", "pink", "seagreen2", "black", "mediumvioletred", "springgreen")) +
labs(x=expression(log[2](FC)),
y=expression(-log[10] ~ "(FDR-adjusted " ~ italic("p") ~ "-value)")) +
theme(axis.title.x=element_text(size=12),
axis.text.x=element_text(size=10)) +
theme(axis.title.y=element_text(size=12),
axis.text.y=element_text(size=10))
p
forLabel <- subset(dfPlot, adj.P.Val<=0.01 & abs(logFC)>=2)
# Adding lines for significance thresholds
#pdf("~/3.0 Hasting Research/Liver Cancer Analysis/DEG Figures/Volcano_plot_male_Neither_DEGs.pdf", width=12, height=12)
p + geom_hline(yintercept = 2, colour="#000000", linetype="dashed"
) + geom_vline(xintercept = 2, colour="#000000", linetype="dashed"
) + geom_vline(xintercept = -2, colour="#000000", linetype="dashed"
) + geom_text_repel(data=forLabel, max.iter=1000, box.padding = 0.25, force=1, aes(x = logFC, y = -log10(adj.P.Val), label=name, color=Color), size=8)
## Warning: ggrepel: 733 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
#dev.off()
This sections creates, plots, adds significance thresholds and creats venn diagrams. The plot is a volcano plot of female nonviral tumors vs. tumor adjacent samples. The significan thresholds can be seen via the dotted lines on the graph.
The Venn Diagram is showing the overlap between male and female non-viral tumors.
# ===========================================
#Volcano plot of female non-viral tumor vs tumor-adjacent
# ===========================================
df <- data.frame(vTopTable_F_Neither$adj.P.Val, vTopTable_F_Neither$logFC, vTopTable_F_Neither$chr, vTopTable_F_Neither$GENEID, vTopTable_F_Neither$gene_name)
colnames(df) <- c("adj.P.Val", "logFC", "chr", "id", "name")
dfSig <- df[(abs(df$logFC) >= 2 & df$adj.P.Val <= 0.01),]$id
dfAnons <- subset(df, chr != "chrX" & chr != "chrY" & !(id %in% dfSig))
dfAnons <- cbind(dfAnons , rep(1, nrow(dfAnons)))
colnames(dfAnons)[6] <- "Color"
dfXnons <- subset(df, chr == "chrX" & !(id %in% dfSig))
dfXnons <- cbind(dfXnons, rep(2, nrow(dfXnons)))
colnames(dfXnons)[6] <- "Color"
dfYnons <- subset(df, chr == "chrY" & !(id %in% dfSig))
dfYnons <- cbind(dfYnons, rep(3, nrow(dfYnons)))
colnames(dfYnons)[6] <- "Color"
dfA <- subset(df, chr != "chrX" & chr != "chrY" & id %in% dfSig)
dfA <- cbind(dfA, rep(4, nrow(dfA)))
colnames(dfA)[6] <- "Color"
dfX <- subset(df, chr == "chrX" & id %in% dfSig)
dfX <- cbind(dfX, rep(5, nrow(dfX)))
colnames(dfX)[6] <- "Color"
dfY <- subset(df, chr == "chrY" & id %in% dfSig)
dfY <- cbind(dfY, rep(6, nrow(dfY)))
colnames(dfY)[6] <- "Color"
dfPlot <- rbind(dfAnons, dfXnons, dfYnons, dfA, dfX, dfY)
dfPlot$Color <- as.factor(dfPlot$Color)
# Constructing the plot object. The colors will not work the way they should if
# one of the groups doesn't have any genes
p <- ggplot(data = dfPlot, aes(x = logFC, y = -log10(adj.P.Val), color=Color )) +
geom_point(alpha = 0.5, size = 8) +
theme_bw() +
theme(legend.position = "none") +
xlim(c(-15, 15)) + ylim(c(0, 7.5)) +
scale_color_manual(values = c("azure3", "pink", "seagreen2", "black", "mediumvioletred", "springgreen")) +
labs(x=expression(log[2](FC)),
y=expression(-log[10] ~ "(FDR-adjusted " ~ italic("p") ~ "-value)")) +
theme(axis.title.x=element_text(size=12),
axis.text.x=element_text(size=10)) +
theme(axis.title.y=element_text(size=12),
axis.text.y=element_text(size=10))
p
forLabel <- subset(dfPlot, adj.P.Val<=0.01 & abs(logFC)>=2)
# Adding lines for significance thresholds
#pdf("~/3.0 Hasting Research/Liver Cancer Analysis/DEG Figures/Volcano_plot_female_Neither_DEGs.pdf", width=12, height=12)
p + geom_hline(yintercept = 2, colour="#000000", linetype="dashed"
) + geom_vline(xintercept = 2, colour="#000000", linetype="dashed"
) + geom_vline(xintercept = -2, colour="#000000", linetype="dashed"
) + geom_text_repel(data=forLabel, max.iter=1000, box.padding = 0.25, force=1, aes(x = logFC, y = -log10(adj.P.Val), label=name, color=Color), size=8)
## Warning: ggrepel: 266 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
#dev.off()
library(VennDiagram)
venn7<- venn.diagram(List("Female"=DEGs_F_Neither$gene_name, "Male"=DEGs_M_Neither$gene_name), filename = NULL)
grid.newpage()
grid.draw(venn7)
venn8<- venn.diagram(List("Female"=DEGs_F_Neither_relax_p$gene_name, "Male"=DEGs_M_Neither$gene_name), filename = NULL)
grid.newpage()
grid.draw(venn8)